175904 - Jorge III Altamirano Astorga

177508 - Uriel Miranda Miñón

Load

Se cargan los datos originales en formato JSON y se separan en datos de entrenamiento y de validación; además, Kaggle proporciona un conjunto adicional de datos de prueba para la competencia. :-)

# Reproducible 70/30 split of the Kaggle training recipes into a training
# and a validation subset.
set.seed(175904)
train_raw <- as.data.frame(fromJSON("data/train.json"))
# Every row starts flagged as validation; a random 70% sample of the rows is
# then flagged as training in column 4 (the `train` column just appended,
# given that the JSON yields three columns).
train_raw$train <- FALSE
train_raw[sample(nrow(train_raw), nrow(train_raw) * .7, replace = FALSE), 4] <- TRUE
# Keep the first three columns only, so the split flag is dropped again.
train <- train_raw[train_raw$train, 1:3]
valid <- train_raw[!train_raw$train, 1:3]
# Working copies whose `ingredients` column is overwritten by the cleaning
# step further down.
train_2 <- train
valid_2 <- valid

# Bar chart of how many recipes fell on each side of the split (`train` is
# the logical flag column added above). geom_bar() counts rows per x value
# by default, so it draws the same bars as geom_histogram(stat = "count")
# without the "Ignoring unknown parameters: binwidth, bins, pad" warning.
ggplot(train_raw, aes(x = train)) +
  geom_bar()

Cleaning

Se eliminan de los datos las palabras que consideramos que no añaden valor a la predicción, entre ellas:

  • Gentilicios: mexican, italian, greek, parma, …
  • Onzas, small, warm, hot, rich, …
  • Formas: minced, cubed, smoked, unflavored, unsalted, light, heavy, low-fat, …
  • Variantes: toasted, young, fresh, …
  • Marcas: Oscar Mayer, Kraft, Velveeta, Everglades, …

Asimismo, se condensaron los datos por palabras clave, que consideramos ordinales en importancia:

  1. Vegetales: beans, lettuce, potato(es), lemon, …
  2. Derivados animales: tipos de queso, tipos de embutido, huevos, leche de coco, crema, …
  3. Proteínas: stock, beef, pork, chicken, oyster
  4. Especias: sal, curry, ajo, …

Ejemplo de condensación de datos:

  • Lemon zest -> lemon
  • roasted beef -> beef
  • chicken broth -> chicken

Todo esto derivado de estudiar el dataset con el fin de reducir el número de variables.

Por último, calculamos las frecuencias y eliminamos los ingredientes poco frecuentes (con \(n\) ocurrencias o menos); en nuestro caso elegimos \(n\) de modo que quedaran alrededor de 100 variables.

Limpieza

# Regular expression listing the words and phrases that are stripped from
# ingredient names (brands, sizes, preparation forms, origins, ...). It is
# one big alternation that must be followed by whitespace or the end of the
# string, `(\\s|$)`; the cleaning pipelines pass it to
# gsub(..., " ", perl = T, ignore.case = T).
# Only the alternatives written with a leading `^` are anchored; the others
# can match anywhere, including the tail of a longer word.
# NOTE(review): the alternation contains an empty alternative
# (`tortilla chips||domino`) -- confirm whether that is intended.
# NOTE(review): as written here the string literal spans several physical
# lines, so it contains newline characters inside `its? ... not butter` and
# `warm ... water`; confirm whether these breaks exist in the real source or
# are a rendering artifact.
unimportant_words <- "(^a taste of|any|low-fat|low ?salt|all|powder|baby|bertolli|boiled|boiling|bone in|whole|boneless|bottled|sauvignon|california|campbells condensed|canned|chopped|flavored carbonated beverage|cold|condensed|cooked|cooking|cereal|lowfat|frosting|spread|soften|with chives.*|ic peach| of .*|creamed|creamy| mexican.?|crushed|crystal farms|shredded|crystallized|crystal hot|cubed|curly|curlyleaf|jelly|dessert mix|sauce mix|mix|dark|dellallo|deepfried|deep fried|diced|diet|tortilla chips||domino|dried|minced|dry|earth balance|elmlea|^english|evaporated|everglades|extra fine|extra firm|extra large|extra\\s?lean|extra light|extra sharp|extra\\s?virgin|extra wide|^fat|fat\\s?free|fat skimmed|fatfree?|fattrimmed|fine|firm|firmly packed|flat |^flavored|terrine|food|free\\s?range|^french|^fresh| root|^fresno|^fried|^frozen|^fuji|full\\s?fat|gluten\\s?free|s milk |^gold|golden|gourmet|graham|granulated|grassfed|grated|grating|gravenstein|great|greater|style|green|grilled|grill|ground|half|heavy|heinz|hellmanns?|of the woods|herbed |herdez|hidden valley|homemade|^hot |hot smoked|hot spanish|^hungarian|hurst|i cantbelieve? its? 
not butter|imitation|imperial sugar light brown|instant |^irish|^italian|italianstyle|^japanese |jimmy dean|johnsonville|jose cuervo|jumbo|kikkoman|knorr|knudsen|kraft|mexican style|kraft zesty|slim cut|sun dried|shredded|la victoria|land o lakes|^large|^lean|leftover|leg of|zest|less sodium|lesser|leaves|^light|cook and drain|lipton|liquid |^lite|^long |loosely packed fresh|low fat|lowfat|^low sodium|lower sodium|lowfat|baked|\\sdeli|firm silken|styl|lowsodium|and cheese dinner|madagascar bourbon|extract|mccormick|^medium|uncook|uncooked|merguez|^mexican|minced|mini|mini|mixed|mixture|mizkan|^mms|mrs dash|natural|^nido|non dairy|non fat|non stick|nondairy|nonfat|frozen|nonhydrogenated|nosaltadded|old el paso|old|old\\s?fashioned|cooking spray|flavored|^organic|oscar mayer|other|oven\\s?ready|flavor|flavour|paella|reggiano|peeled|^petite|pillsbury|powdered|prepared|preserv|preserved|progresso|\\sdi\\sparma|pt|pte|puff|puffed|pure|quickcooking|quick|cooking|raw|red|reduced\\sfat|reduced\\ssodium|reduced\\ssugar|reducedfat|reducedsodium|reducedsugar|refrigerated|regular|rich|roasted|roast|roasting|robert mondavi|salt free seasoning|salt free chili powder|salt free cajun creole seasoning|salt free southwest chipotle seasoning|salt free herb seasoning|salt free chili powder|salted|saltines?|saltpeper|san marazano|sargento|links|casings|savoy|seafood|seasoned|seasoning|seedless|self ?ra?ising|shredded|single|simple|skinless|sliced|small|smoked|sodium free|sodium reduced|soft|softened|solid|southern comfort.*|southwest|sparkling|spicy|splenda.*|split|spring water|^strip|superfine|sweetened|taco bell.*|into serving pieces|to\\s+lb|toasted|uncle bens|^uncook|^uncooked|unflavou?red|unsweetened|white|wholesome sweeteners|wholemilk|wide|^wild|^winter|wish\\s?bone|yellow|young|zesty|part ?skim|italian|all ?purpose|puree|juice|aged|tuna in water|liqueur|liquor|^asian|and .*|yoplait|greek|fresh|spray|hot water|warm 
water|crumbles|freshly|flakes?|unsalt|unsalted|wedges?|plain)(\\s|$)"
# Collapse ingredient tokens to the keyword they contain, if any.
#
# a: character vector of cleaned ingredient tokens (words joined by "_").
#
# Returns, invisibly, a vector of the same length as `a`. A token that
# contains one of the keywords is replaced by the keyword captured in the
# second regex group; a token without any keyword comes back unchanged.
# Keywords are matched as plain substrings (the pattern has no word
# boundaries).
popular_words <- function(a) {
  keyword_pattern <- "(.*)(beans?|lettuce|olives?|tabasco|potato(es)?|cilantro|wheat|shiitake|lemon|chives?|tomato(es)?|cabbage|peanut|yogh?o?urt|rice|onions?|ginger|sesame|jalapeno|stock|bacon|monterey_jack|vinegar|sausages?|mozz?arell?a|monterey_jack|feta|ricotta|dijon|masala|eggs?|coconut_milk|cheddar|dijon|parmesan|sour_(crema|cream)|steak|pork|beef|chicken|oyster|garlic|salt|curry).*"
  condensed <- gsub(keyword_pattern, "\\2", a)
  invisible(condensed)
}

# Normalise the raw ingredient strings of one recipe into condensed tokens.
#
# raw: the ingredient entry of a single recipe; anything that `unlist()`
#      turns into a character vector.
#
# Returns a character vector of tokens (words joined by "_"). Relies on the
# `unimportant_words` pattern and the `popular_words()` function defined
# above.
clean_ingredients <- function(raw) {
  tokens <- tolower(unlist(raw))
  # Drop parenthesised remarks, then everything that is not a letter or space.
  tokens <- gsub("\\([^)]*\\)", "", tokens, perl = TRUE, ignore.case = TRUE)
  tokens <- gsub("[^ a-z]", "", tokens, perl = TRUE, ignore.case = TRUE)
  # Collapse leading whitespace to a single space.
  tokens <- gsub("^\\s+", " ", tokens, perl = TRUE, ignore.case = TRUE)
  # Blank out the words considered uninformative, then squeeze the gaps.
  tokens <- gsub(unimportant_words, " ", tokens, perl = TRUE, ignore.case = TRUE)
  tokens <- gsub("\\s+", " ", tokens, perl = TRUE, ignore.case = TRUE)
  # Same order as before: de-duplicate first, trim afterwards.
  tokens <- trimws(unique(tokens))
  tokens <- gsub("\\s", "_", tokens, perl = TRUE, ignore.case = TRUE)
  popular_words(tokens)
}

# Clean the ingredients of the training recipes (column 3 of `train`).
# lapply() always returns a list, one element per recipe, whereas sapply()
# silently simplifies to a matrix or vector whenever every recipe happens to
# yield the same number of tokens. seq_len() also copes with zero rows.
train_2$ingredients <- lapply(seq_len(nrow(train)), function(i) {
  clean_ingredients(train[i, 3])
})

# Clean the ingredients of the validation recipes in exactly the same way.
valid_2$ingredients <- lapply(seq_len(nrow(valid)), function(i) {
  clean_ingredients(valid[i, 3])
})

# Encode the cuisine label as a factor.
# NOTE(review): the second statement converts `valid`, not `valid_2`; the
# later steps build `valid_3` from `valid_2`, whose cuisine column therefore
# stays character (see the glimpse() output further down). Confirm whether
# `valid_2` was the intended target before changing it, since comparing two
# factors requires identical level sets.
train_2$cuisine <- as.factor(train_2$cuisine)
valid$cuisine <- as.factor(valid$cuisine)

# Flatten all cleaned training recipes into one long vector of tokens and
# discard the tokens that cleaning reduced to the empty string.
ingredients <- unlist(train_2$ingredients)
ingredients <- ingredients[!grepl(pattern = "^$", x = ingredients)]
head(ingredients, n = 10)
##  [1] "lettuce"  "olives"   "tomatoes" "garlic"   "pepper"   "onion"   
##  [7] "beans"    "feta"     "flour"    "pepper"
# One-column data frame of tokens, grouped by token.
ingredients_df <- group_by(
  as.data.frame(ingredients, stringsAsFactors = FALSE),
  ingredients
)

# Frequency of every token, sorted ascending; `id` records that rank.
ingredients_count <- arrange(plyr::count(ingredients_df), freq)
ingredients_count$id <- 1:nrow(ingredients_count)
head(ingredients_count)
tail(ingredients_count)
# Tokens seen more than 150 times, most frequent first; these become the
# model's indicator variables.
ingredients_top <- select(
  arrange(filter(ingredients_count, freq > 150), desc(freq)),
  ingredients
)

Variables

# Start both design matrices from the first two columns of the cleaned data.
train_3 <- train_2[, 1:2]
valid_3 <- valid_2[, 1:2]
# Create one zero-filled indicator column per retained ingredient.
train_3[, unlist(ingredients_top)] <- 0
valid_3[, unlist(ingredients_top)] <- 0
# Fill the columns: for every recipe, set to 1 each column whose name appears
# among that recipe's cleaned tokens.
for (i in 1:nrow(train_3)) {
  train_3[i, which(names(train_3) %in% unlist(train_2[i, ]$ingredients))] <- 1
}
for (i in 1:nrow(valid_3)) {
  valid_3[i, which(names(valid_3) %in% unlist(valid_2[i, ]$ingredients))] <- 1
}
rm(i)
# Cache the cleaned data and the design matrices on disk.
saveRDS(train_2, "data/train_2.rds")
saveRDS(train_3, "data/train_3.rds")
saveRDS(valid_2, "data/valid_2.rds")
saveRDS(valid_3, "data/valid_3.rds")
# train_2 <- readRDS("data/train_2.rds")
# train_3 <- readRDS("data/train_3.rds")
# valid_2 <- readRDS("data/valid_2.rds")
# valid_3 <- readRDS("data/valid_3.rds")
# Preview a few indicator columns of each matrix.
head(select(train_3, id, cuisine, garlic, salt, pepper), n = 5)
head(select(valid_3, id, cuisine, garlic, salt, pepper), n = 5)

EDA

Frecuencias de ingredientes

# Log-frequency curve of the tokens seen more than 100 times, ordered by the
# ascending-frequency rank stored in `id`.
ggplot(
  filter(ingredients_count, freq > 100),
  aes(x = id, y = log(freq))
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Número de ingredientes por tipo de cocina

# Number of cleaned ingredient tokens per training recipe, then its
# distribution per cuisine as box plots.
n_ing <- train_2$ingredients
train_2$n_ing <- lengths(n_ing, use.names = FALSE)
ggplot(train_2, aes(x = cuisine, y = n_ing)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# The temporary copy of the list column is no longer needed.
rm(n_ing)

Stats

# Summary statistics of the number of ingredients per cuisine.
# The cuisine labels are taken from the grouping column that aggregate()
# returns, so every label is guaranteed to sit on the row of its own
# statistics. (unique(train_2$cuisine) lists cuisines in order of first
# appearance, while aggregate() orders its rows by group, so pairing the two
# mislabels the rows.)
# local() keeps the helper objects out of the global environment; its value
# is the resulting data frame, which is printed as before.
local({
  # Apply one summary function to n_ing within each cuisine.
  by_cuisine <- function(stat) {
    aggregate(train_2$n_ing, list(train_2$cuisine), stat)
  }
  means <- by_cuisine(mean)
  data.frame(
    Cuisine = means[, 1],
    Mean = means[, 2],
    SD = by_cuisine(sd)[, 2],
    Min = by_cuisine(min)[, 2],
    Max = by_cuisine(max)[, 2]
  )
})

Frecuencias de ingredientes por cocina

# Relative frequency of each ingredient count, one panel per cuisine.
# Proportions are computed within each cuisine because of `group = cuisine`.
ggplot(data = train_2, mapping = aes(x = n_ing, group = cuisine)) +
  geom_bar(
    mapping = aes(y = ..prop.., fill = factor(..x..)),
    stat = "count"
  ) +
  facet_wrap(~cuisine) +
  scale_y_continuous(labels = scales::percent) +
  ylab("relative frequencies")

Casos por cocina

Vemos que la cocina mexicana es una cocina muy popular. :-)

# Number of training recipes per cuisine, bars sorted from most to least
# frequent. plyr::count() yields the columns `x` (cuisine) and `freq`.
ggplot(
  plyr::count(train_2$cuisine),
  aes(x = reorder(x, -freq), y = freq)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  xlab("Cuisine") +
  ylab("Frecuencias")

Ingredientes por cocina

# Reload the cleaned training recipes and the ingredient frequency table from
# disk, then expand the recipes to one row per (recipe, ingredient) pair.
train_2 <- readRDS("data/train_2.rds")
# NOTE(review): nothing in the visible code writes this file; confirm that it
# holds the same table as the `ingredients_count` built earlier.
ingredients_count <- readRDS("data/ingredients_count.Rdata")
ingredients_graph <- train_2 %>%
  unnest

# Last 101 rows of the frequency table (the most frequent tokens if the table
# is sorted by ascending frequency, as it is when built above).
top_ing <- ingredients_count[(nrow(ingredients_count)-100):nrow(ingredients_count),]

# Recipe counts per top ingredient, stacked by cuisine. geom_bar() counts
# rows per x value by default; it replaces geom_histogram(stat = "count"),
# which produced the same bars plus an "Ignoring unknown parameters" warning.
ggplot(
  ingredients_graph[ingredients_graph$ingredients %in% top_ing$ingredients, ],
  aes(x = ingredients, fill = cuisine)
) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Models

Árbol

Predice sumamente mal, incluso con el conjunto de entrenamiento. Debido a esto, exploramos otros algoritmos.

# Classification tree. The two commented-out lines show how the tree was
# presumably fitted (rpart on every column except the first, cp = 0) and
# saved; the fitted object is restored from disk instead of being refitted.
# arbol_grande <- rpart(cuisine ~ ., data= train_3[,-1], cp=0)
# save(arbol_grande, file = "data/arbol.Rdata")
# The code below expects this file to define `arbol_grande`.
load(file = "data/arbol.Rdata")
# Plot the tree after pruning it at cp = 0.03.
prp(prune(arbol_grande, cp=0.03), type=4, extra=1, digits=3) 

# Predict the class of every training recipe with the unpruned tree.
# Columns 3:210 are the ingredient indicator columns (positions fixed by how
# train_3 was built) -- TODO confirm if the ingredient list changes.
train_3$arbol <- predict(arbol_grande, newdata = train_3[,3:210], type="class")
# Count correct (TRUE) and incorrect (FALSE) predictions on the training set.
train_3 %>% mutate(arbol_pred = (cuisine == arbol)) %>% select(arbol_pred) %>% summary
##  arbol_pred     
##  Mode :logical  
##  FALSE:10294    
##  TRUE :17547
# Show true and predicted cuisine side by side.
train_3 %>% select(cuisine, arbol)

Bosques Aleatorios

# Random forest. The commented-out code shows how the forest was presumably
# trained (three randomForest fits of 150 trees each, combined with foreach)
# and saved; the fitted object is restored from disk instead.
# bosque <- foreach(ntree=rep(150, 3), .combine=combine, .multicombine=TRUE,
#               .packages='randomForest') %dopar% {
#     randomForest(cuisine ~ . , data = train_3[,2:210], ntree=ntree)
#               }
# save(bosque, file="data/bosque.Rdata")
# The prediction code below expects this file to define `bosque`.
load("data/bosque.Rdata")

Entrenamiento

# Predict the training recipes with the random forest (columns 3:210 are the
# ingredient indicators) and count correct versus incorrect predictions.
train_3$bosque <- predict(bosque, newdata = train_3[, 3:210], type = "class")
summary(
  train_3 %>%
    mutate(bosque_pred = (cuisine == bosque)) %>%
    select(bosque_pred)
)
##  bosque_pred    
##  Mode :logical  
##  FALSE:2681     
##  TRUE :25160
# True and predicted cuisine side by side.
select(train_3, cuisine, bosque)

Validación

# Restore the forest again, predict the validation recipes (columns 3:210
# are the ingredient indicators) and count correct versus incorrect
# predictions.
load("data/bosque.Rdata")
valid_3$bosque <- predict(bosque, newdata = valid_3[, 3:210], type = "class")
summary(
  valid_3 %>%
    mutate(bosque_pred = (cuisine == bosque)) %>%
    select(bosque_pred)
)
##  bosque_pred    
##  Mode :logical  
##  FALSE:3621     
##  TRUE :8312
# True and predicted cuisine side by side.
select(valid_3, cuisine, bosque)

SVM

# Support vector machine. The commented-out code shows how the model was
# presumably trained (parallelSVM on columns 2:210 with a 0.2 sampling size,
# using all cores but one) and saved; the fitted object is restored from
# disk instead.
# set.seed(175904)
# svm <- parallelSVM(cuisine ~ . , data = train_3[,2:210], 
#             numberCores = detectCores()-1,
#             samplingSize = 0.2, 
#             na.action = na.omit, 
#             scale = TRUE)
# save(svm, file = "data/svm.Rdata")
# The prediction code below expects this file to define `svm`.
load(file = "data/svm.Rdata")

Entrenamiento

# Predict the training recipes with the SVM (columns 3:210 are the
# ingredient indicators) and count correct versus incorrect predictions.
train_3$svm <- predict(svm, newdata = train_3[, 3:210], type = "class")
summary(
  train_3 %>%
    mutate(svm_pred = (cuisine == svm)) %>%
    select(svm_pred)
)
##   svm_pred      
##  Mode :logical  
##  FALSE:8666     
##  TRUE :19175
# True and predicted cuisine side by side.
select(train_3, cuisine, svm)

Validación

# Predict the validation recipes with the SVM.
# The two commented-out lines allow this chunk to run on its own by reloading
# the validation matrix and the model from disk.
# valid_3 <- readRDS("data/valid_3.rds")
# load(file = "data/svm.Rdata")
# Columns 3:210 are the ingredient indicator columns.
valid_3$svm <- predict(svm, newdata = valid_3[,3:210], type="class")
# The forest prediction is recomputed here as well; it repeats the earlier
# validation step and only matters if valid_3 was reloaded just above.
valid_3$bosque <- predict(bosque, newdata = valid_3[,3:210], type="class")
# Count correct (TRUE) and incorrect (FALSE) SVM predictions.
valid_3 %>% mutate(svm_pred = (cuisine == svm)) %>% select(svm_pred) %>% summary
##   svm_pred      
##  Mode :logical  
##  FALSE:4050     
##  TRUE :7883
# Show true and predicted cuisine side by side.
valid_3 %>% select(cuisine, svm)

Comparación

# Per-cuisine validation accuracy of the random forest and the SVM.
# Both prediction columns are replaced by logical "was it right?" flags,
# evaluated within each true cuisine.
valid_3a <- valid_3 %>%
  select(cuisine, svm, bosque) %>%
  group_by(cuisine) %>%
  mutate(
    bosque = (cuisine == bosque),
    svm = (cuisine == svm)
  )
# Share of correct predictions per cuisine for each model.
data_valid <- valid_3a %>%
  summarise(bosque = mean(bosque), svm = mean(svm))
# One bar chart per model, cuisines sorted by decreasing accuracy.
g1 <- ggplot(data_valid, aes(x = reorder(cuisine, -bosque), y = bosque)) +
  geom_bar(stat = "identity") +
  xlab("cuisine") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
g2 <- ggplot(data_valid, aes(x = reorder(cuisine, -svm), y = svm)) +
  geom_bar(stat = "identity") +
  xlab("cuisine") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(g1, g2)

# Column-by-column overview of the validation matrix with both predictions.
glimpse(valid_3)
## Observations: 11,933
## Variables: 212
## $ id                   <int> 20130, 22213, 42779, 3735, 45887, 2698, 1...
## $ cuisine              <chr> "filipino", "indian", "spanish", "italian...
## $ salt                 <dbl> 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,...
## $ garlic               <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,...
## $ onions               <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,...
## $ olive                <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,...
## $ chicken              <dbl> 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,...
## $ pepper               <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,...
## $ sugar                <dbl> 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ tomatoes             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ water                <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ butter               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ black_pepper         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ onion                <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ eggs                 <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ flour                <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lemon                <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cilantro             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ginger               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vinegar              <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rice                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,...
## $ vegetable_oil        <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ soy_sauce            <dbl> 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,...
## $ lime                 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,...
## $ milk                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beans                <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ cumin                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sesame               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ bell_pepper          <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ egg                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ carrots              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parmesan             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beef                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parsley              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ oregano              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ basil                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chili                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tomato               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ potatoes             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ brown_sugar          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ oil                  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pork                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ wine                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shrimp               <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ thyme                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ stock                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ sour_cream           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cream                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cinnamon             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cheddar              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ scallions            <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baking               <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vanilla              <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ jalapeno             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_starch          <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,...
## $ peanut               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ coriander            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cayenne_pepper       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ paprika              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ curry                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shallots             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ fish_sauce           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mozzarella           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ olives               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ celery               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ honey                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ spinach              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ avocado              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cabbage              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ coconut_milk         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ canola_oil           <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bacon                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lettuce              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sausage              <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ mushrooms            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bay                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ mint                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ orange               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ yogurt               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ nutmeg               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ flat_leaf_parsley    <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ masala               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_tortillas       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ buttermilk           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cucumber             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ salsa                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baking_soda          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ steak                <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ flour_tortillas      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cumin_seed           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ turmeric             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peas                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chilies              <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ whipping_cream       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ zucchini             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bay_leaf             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cheese               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mayonaise            <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,...
## $ cream_cheese         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tumeric              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rosemary             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bread_crumbs         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ raisins              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ricotta              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ wheat                <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ worcestershire_sauce <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ hot_sauce            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chili_peppers        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cinnamon_sticks      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ feta                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chives               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ bread                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ monterey_jack        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ clove                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shiitake             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ celery_ribs          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ oyster               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...
## $ dijon                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tofu                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ capers               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bean                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cornmeal             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ allspice             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ almonds              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mirin                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ taco                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chile                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,...
## $ cayenne              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sauce                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vegetable_broth      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ black_peppercorns    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coconut              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pecans               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mustard              <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cloves               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ketchup              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hoisin_sauce         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cracked_black_pepper <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cardamom             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_kernels         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ confectioners_sugar  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ potato               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sage                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ leeks                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ham                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ active_yeast         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chickpeas            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sausages             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cajun                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ parmigiano_cheese    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ enchilada_sauce      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sherry               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pasta                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coriander_seeds      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ dill                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cocoa                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sake                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pinenuts             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_meal            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ spaghetti            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chiles               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chile_pepper         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ fennel_seeds         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tomatillos           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ turkey               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ baguette             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hot_pepper_sauce     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ mustard_seeds        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lamb                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ okra                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pineapple            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ prosciutto           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ sriracha             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ melted_butter        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ serrano_chile        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peppers              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ creole               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,...
## $ yoghurt              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ghee                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ walnuts              <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ peaches              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ radishes             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ beer                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lasagna_noodles      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ pasta_sauce          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ grits                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ asparagus            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ corn_syrup           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ mango                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shaoxing_wine        <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ saffron_threads      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ vegetables           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ star_anise           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ lentils              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ yeast                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ broccoli             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chinese_fivespice    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ marinara_sauce       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ linguine             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ shortening           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ noodles              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ hot_pepper           <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ margarine            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ cardamom_pods        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ tarragon             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ strawberries         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ coconut_oil          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bananas              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ chipotle_chile       <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ bosque               <fctr> chinese, indian, mexican, italian, chine...
## $ svm                  <fctr> chinese, indian, italian, italian, chine...

Kaggle Submission

# Load the Kaggle test recipes and clean their ingredients (column 2) with
# the same steps that were applied to the training data.
test_raw <- fromJSON("data/test.json")
test_raw <- test_raw %>% as.data.frame
test <- test_raw
# test_raw$ingredients %>% unlist
# lapply() always returns a list, one element per recipe, whereas sapply()
# silently simplifies to a matrix or vector whenever every recipe happens to
# yield the same number of tokens. seq_len() also copes with zero rows.
test$ingredients <- lapply(seq_len(nrow(test_raw)), function(x) {
  test[x, 2] %>%
    unlist %>%
    tolower %>%
    # Drop parenthesised remarks, then everything but letters and spaces.
    gsub("\\([^)]*\\)", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("[^ a-z]", "", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("^\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    # Blank out uninformative words, then squeeze repeated whitespace.
    gsub(unimportant_words, " ", ., perl = TRUE, ignore.case = TRUE) %>%
    gsub("\\s+", " ", ., perl = TRUE, ignore.case = TRUE) %>%
    unique %>%
    trimws %>%
    # Join the remaining words with "_" and condense to keywords.
    gsub("\\s", "_", ., perl = TRUE, ignore.case = TRUE) %>%
    popular_words
})
test
# Build the indicator matrix for the Kaggle test set, predict with the SVM
# and write the submission file.
# Start from a one-column table holding the recipe ids (column 1 of `test`).
test_2 <- test[,1] %>% data_frame(id=.)
# Create one zero-filled column per retained ingredient.
test_2[, ingredients_top$ingredients] <- 0
# Fill the columns: set to 1 each column whose name appears among the
# recipe's cleaned tokens.
for(i in 1:nrow(test_2)){
  test_2[i, which(names(test_2) %in% (test[i,]$ingredients %>% unlist))] <- 1
}
test_2
# NOTE(review): columns 1:209 include `id` (column 1), whereas the training
# and validation predictions pass only the indicator columns (3:210 there);
# confirm that predict() ignores the extra column.
test_2$cuisine <- predict(svm, newdata = test_2[,1:209], type="class")
# Distribution of the predicted cuisines.
test_2 %>% select(id,cuisine) %>% summary
##        id               cuisine    
##  Min.   :    5   italian    :2645  
##  1st Qu.:12541   mexican    :1631  
##  Median :24732   southern_us:1614  
##  Mean   :24894   indian     : 875  
##  3rd Qu.:37113   chinese    : 869  
##  Max.   :49715   french     : 708  
##                  (Other)    :1602
# Write the id/cuisine pairs as the Kaggle submission CSV.
test_2 %>% select(id, cuisine) %>% write_csv(., "data/svm_submission.csv")

Kaggle Submission :-(